Data anlysis with COVID-19 data from Brazil until May/2021

Data obtained from website Brasil.io - https://brasil.io/dataset/covid19/files/

Hiago W. Petris - 22/05/2021

Download at: https://www.kaggle.com/hiagow/dados-covid-brasil-abril2021/

setwd("C:/Users/hiago/OneDrive/Projetos/Analise-Dados-Covid/")

library(tidyverse) # metapackage of all tidyverse packages
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.1.1     v dplyr   1.0.6
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(ggplot2)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(readr)
library(dplyr)

arq = "C:/Users/hiago/Downloads/caso.csv"
df = read_csv(arq)
## 
## -- Column specification --------------------------------------------------------
## cols(
##   date = col_date(format = ""),
##   state = col_character(),
##   city = col_character(),
##   place_type = col_character(),
##   confirmed = col_double(),
##   deaths = col_double(),
##   order_for_place = col_double(),
##   is_last = col_logical(),
##   estimated_population_2019 = col_double(),
##   estimated_population = col_double(),
##   city_ibge_code = col_double(),
##   confirmed_per_100k_inhabitants = col_double(),
##   death_rate = col_double()
## )
# View(df)
head(df)
## # A tibble: 6 x 13
##   date       state city  place_type confirmed deaths order_for_place is_last
##   <date>     <chr> <chr> <chr>          <dbl>  <dbl>           <dbl> <lgl>  
## 1 2021-05-21 AP    <NA>  state         109906   1654             424 TRUE   
## 2 2021-05-20 AP    <NA>  state         109777   1645             423 FALSE  
## 3 2021-05-19 AP    <NA>  state         109625   1635             422 FALSE  
## 4 2021-05-18 AP    <NA>  state         109479   1628             421 FALSE  
## 5 2021-05-17 AP    <NA>  state         109272   1622             420 FALSE  
## 6 2021-05-16 AP    <NA>  state         109070   1615             419 FALSE  
## # ... with 5 more variables: estimated_population_2019 <dbl>,
## #   estimated_population <dbl>, city_ibge_code <dbl>,
## #   confirmed_per_100k_inhabitants <dbl>, death_rate <dbl>
str(df)
## spec_tbl_df[,13] [1,970,810 x 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ date                          : Date[1:1970810], format: "2021-05-21" "2021-05-20" ...
##  $ state                         : chr [1:1970810] "AP" "AP" "AP" "AP" ...
##  $ city                          : chr [1:1970810] NA NA NA NA ...
##  $ place_type                    : chr [1:1970810] "state" "state" "state" "state" ...
##  $ confirmed                     : num [1:1970810] 109906 109777 109625 109479 109272 ...
##  $ deaths                        : num [1:1970810] 1654 1645 1635 1628 1622 ...
##  $ order_for_place               : num [1:1970810] 424 423 422 421 420 419 418 417 416 415 ...
##  $ is_last                       : logi [1:1970810] TRUE FALSE FALSE FALSE FALSE FALSE ...
##  $ estimated_population_2019     : num [1:1970810] 845731 845731 845731 845731 845731 ...
##  $ estimated_population          : num [1:1970810] 861773 861773 861773 861773 861773 ...
##  $ city_ibge_code                : num [1:1970810] 16 16 16 16 16 16 16 16 16 16 ...
##  $ confirmed_per_100k_inhabitants: num [1:1970810] 12753 12739 12721 12704 12680 ...
##  $ death_rate                    : num [1:1970810] 0.015 0.015 0.0149 0.0149 0.0148 0.0148 0.0148 0.0148 0.0148 0.0148 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   date = col_date(format = ""),
##   ..   state = col_character(),
##   ..   city = col_character(),
##   ..   place_type = col_character(),
##   ..   confirmed = col_double(),
##   ..   deaths = col_double(),
##   ..   order_for_place = col_double(),
##   ..   is_last = col_logical(),
##   ..   estimated_population_2019 = col_double(),
##   ..   estimated_population = col_double(),
##   ..   city_ibge_code = col_double(),
##   ..   confirmed_per_100k_inhabitants = col_double(),
##   ..   death_rate = col_double()
##   .. )

First part: Only data from state of Paraná

dfPR = df %>% 
  arrange(date) %>%
  filter(state=='PR', place_type=='state')
head(dfPR)
## # A tibble: 6 x 13
##   date       state city  place_type confirmed deaths order_for_place is_last
##   <date>     <chr> <chr> <chr>          <dbl>  <dbl>           <dbl> <lgl>  
## 1 2020-03-12 PR    <NA>  state              6      0               1 FALSE  
## 2 2020-03-13 PR    <NA>  state              6      0               2 FALSE  
## 3 2020-03-16 PR    <NA>  state              6      0               3 FALSE  
## 4 2020-03-17 PR    <NA>  state             12      0               4 FALSE  
## 5 2020-03-18 PR    <NA>  state             14      0               5 FALSE  
## 6 2020-03-19 PR    <NA>  state             23      0               6 FALSE  
## # ... with 5 more variables: estimated_population_2019 <dbl>,
## #   estimated_population <dbl>, city_ibge_code <dbl>,
## #   confirmed_per_100k_inhabitants <dbl>, death_rate <dbl>
nrow(dfPR)
## [1] 434
#View(dfPR)

Cumulative cases at Paraná

figCumulativeCases <- plot_ly(dfPR,x=~date,y=~confirmed,type='scatter',name='Cumulative cases',mode='lines')
figCumulativeCases <- figCumulativeCases %>% layout(xaxis=list(title='Date'),yaxis=list(title='Cumulative cases'))
figCumulativeCases <- figCumulativeCases %>% config(locale="pt-br")
figCumulativeCases

Cumulative cases and deaths

figCumulativeCasesAndDeaths <- figCumulativeCases %>% add_trace(y=~deaths,name='Cumulative deaths',mode='lines')
figCumulativeCasesAndDeaths

Death rate

figDeathRate <- plot_ly(dfPR,x=~date,y=~death_rate,type='scatter',name='Death rate',mode='lines')
figDeathRate <- figDeathRate %>% layout(xaxis=list(title='Date'),yaxis=list(title='Death rate'))
figDeathRate

New cases per day

# ?dplyr::lag
dfPRNewCases = dfPR

dfPRNewCases = dfPRNewCases %>%
  mutate(new_cases = confirmed-lag(confirmed), .after=confirmed)

dfPRNewCases$new_cases[1] = dfPRNewCases$confirmed[1]

head(dfPRNewCases)
## # A tibble: 6 x 14
##   date       state city  place_type confirmed new_cases deaths order_for_place
##   <date>     <chr> <chr> <chr>          <dbl>     <dbl>  <dbl>           <dbl>
## 1 2020-03-12 PR    <NA>  state              6         6      0               1
## 2 2020-03-13 PR    <NA>  state              6         0      0               2
## 3 2020-03-16 PR    <NA>  state              6         0      0               3
## 4 2020-03-17 PR    <NA>  state             12         6      0               4
## 5 2020-03-18 PR    <NA>  state             14         2      0               5
## 6 2020-03-19 PR    <NA>  state             23         9      0               6
## # ... with 6 more variables: is_last <lgl>, estimated_population_2019 <dbl>,
## #   estimated_population <dbl>, city_ibge_code <dbl>,
## #   confirmed_per_100k_inhabitants <dbl>, death_rate <dbl>
#View(dfPRNewCases)

New cases

figNewCases <- plot_ly(dfPRNewCases,x=~date,y=~new_cases,type='scatter',name='New Cases',mode='lines')
figNewCases <- figNewCases %>% layout(xaxis=list(title='Date'),yaxis=list(title='New Cases'))
figNewCases <- figNewCases %>% config(locale="pt-br")
figNewCases

Analysis of the impact of a restriction measure, getting 14 days before and 14 days after the measure

dfPRRestrictionMeasure = dfPRNewCases %>%
  filter(date>=as.Date('2021-03-31')-14, date<=as.Date('2021-04-05')+14) %>%
  select(date, new_cases, death_rate)

head(dfPRRestrictionMeasure)
## # A tibble: 6 x 3
##   date       new_cases death_rate
##   <date>         <dbl>      <dbl>
## 1 2021-03-17      5461     0.0183
## 2 2021-03-18      6469     0.0184
## 3 2021-03-19      8146     0.0186
## 4 2021-03-20      4758     0.0187
## 5 2021-03-21      2146     0.0187
## 6 2021-03-22      3224     0.0187
# View(dfPRRestrictionMeasure)

figRestrictionMeasure <- plot_ly(dfPRRestrictionMeasure,x=~date,y=~new_cases,type='scatter',name='New Cases',mode='lines')
figRestrictionMeasure <- figRestrictionMeasure %>% layout(xaxis=list(title='Date'),yaxis=list(title='New Cases'))
figRestrictionMeasure <- figRestrictionMeasure %>% config(locale="pt-br")

figRestrictionMeasure <- figRestrictionMeasure %>% add_segments(
  name="Restriction Measure", line=list(color="orange"), 
  x=as.Date('2021-03-31'), xend=as.Date('2021-03-31'),
  y=min(dfPRRestrictionMeasure$new_cases), yend=max(dfPRRestrictionMeasure$new_cases))

figRestrictionMeasure <- figRestrictionMeasure %>% layout(
  shapes=list(
    type="rect",fillcolor="orange", line=list(color='orange'), opacity=0.3, 
    x0=as.Date('2021-03-31'), x1=as.Date('2021-04-05'),
    y0=min(dfPRRestrictionMeasure$new_cases),y1=max(dfPRRestrictionMeasure$new_cases)
  )
)

figRestrictionMeasure <- figRestrictionMeasure %>% add_segments(
  name="Relax of restriction measure", line=list(color="green"), 
  x=as.Date('2021-04-05'), xend=as.Date('2021-04-05'),
  y=min(dfPRRestrictionMeasure$new_cases),yend=max(dfPRRestrictionMeasure$new_cases))

figRestrictionMeasure

Same analysis with Death reate

figRestrictionMeasure2 <- plot_ly(dfPRRestrictionMeasure,x=~date,y=~death_rate,type='scatter',name='Death rate',mode='lines')
figRestrictionMeasure2 <- figRestrictionMeasure2 %>% layout(xaxis=list(title='Date'),yaxis=list(title='Death rate'))
figRestrictionMeasure2 <- figRestrictionMeasure2 %>% config(locale="pt-br")

figRestrictionMeasure2 <- figRestrictionMeasure2 %>% add_segments(
  name="Medida de restrição", line=list(color="orange"), 
  x=as.Date('2021-03-31'), xend=as.Date('2021-03-31'), 
  y=min(dfPRRestrictionMeasure$death_rate),yend=max(dfPRRestrictionMeasure$death_rate))

figRestrictionMeasure2 <- figRestrictionMeasure2 %>% layout(
  shapes=list(
    type="rect",fillcolor="orange", line=list(color='orange'), opacity=0.3, 
    x0=as.Date('2021-03-31'), x1=as.Date('2021-04-05'), 
    y0=min(dfPRRestrictionMeasure$death_rate),y1=max(dfPRRestrictionMeasure$death_rate)
  )
)

figRestrictionMeasure2 <- figRestrictionMeasure2 %>% add_segments(name="Relax of restriction measure", line=list(color="green"), x=as.Date('2021-04-05'), xend=as.Date('2021-04-05'),y=min(dfPRRestrictionMeasure$death_rate),yend=max(dfPRRestrictionMeasure$death_rate))
figRestrictionMeasure2

Contamination rate

  • Calculate contamination rate = new cases / new cases from a day before
dfPRContaminationRate = dfPRNewCases %>%
  mutate(contamination_rate = round(confirmed/lag(confirmed),2), .after=new_cases) %>%
  select(date,confirmed,contamination_rate)

dfPRContaminationRate$contamination_rate[1] = 0

head(dfPRContaminationRate)
## # A tibble: 6 x 3
##   date       confirmed contamination_rate
##   <date>         <dbl>              <dbl>
## 1 2020-03-12         6               0   
## 2 2020-03-13         6               1   
## 3 2020-03-16         6               1   
## 4 2020-03-17        12               2   
## 5 2020-03-18        14               1.17
## 6 2020-03-19        23               1.64
# View(dfPRContaminationRate)

Death rate

  • Change the scale of Y axos or apply some normalization could help in visualization, beacause it starts on 0 and than has a peak at 2, but then it keeps approximately at 1
figDeathRate <- plot_ly(dfPRContaminationRate,x=~date,y=~contamination_rate,type='scatter',name='Contamination Rate',mode='lines')
figDeathRate <- figDeathRate %>% layout(xaxis=list(title='Date'),yaxis=list(title='Contamination Rate'))
figDeathRate <- figDeathRate %>% config(locale="pt-br")
figDeathRate

Deaths / New Cases

  • Filter because it has and outlier
dfPRDeathsPerNewcases = dfPRNewCases %>%
  mutate(deaths_per_new_cases = ifelse(new_cases==0, deaths, round(deaths/new_cases,2)), .after=new_cases) %>%
  select(date,deaths,new_cases,deaths_per_new_cases) %>%
  filter(deaths_per_new_cases < 1000)

head(dfPRDeathsPerNewcases)
## # A tibble: 6 x 4
##   date       deaths new_cases deaths_per_new_cases
##   <date>      <dbl>     <dbl>                <dbl>
## 1 2020-03-12      0         6                    0
## 2 2020-03-13      0         0                    0
## 3 2020-03-16      0         0                    0
## 4 2020-03-17      0         6                    0
## 5 2020-03-18      0         2                    0
## 6 2020-03-19      0         9                    0
figDeathsNewCases <- plot_ly(dfPRDeathsPerNewcases,x=~date,y=~deaths_per_new_cases,type='scatter',name='Deaths/New Cases',mode='lines')
figDeathsNewCases <- figDeathsNewCases %>% layout(xaxis=list(title='Date'),yaxis=list(title='Deaths/New Cases'))
figDeathsNewCases <- figDeathsNewCases %>% config(locale="pt-br")
figDeathsNewCases
# TODO: LM new cases per death
#ggplot(dfPRNewCases,aes(x=estimated_population,y=contamination_rate)) +
# geom_point() +
#stat_smooth(method="lm", col="red")

Data from Brasil Cities

# Creates dataframe with contamination rate and estimated population, only for cities

dfContaminationRatePopulation = df %>% 
  filter(place_type=='city') %>%
  select(date,city,state,estimated_population,confirmed) %>%
  arrange(city,state,date) %>%
  mutate(contamination_rate = confirmed/lag(confirmed), .before=confirmed)

# Removes column confirmed.
# dfContaminationRatePopulation$confirmed = NULL

# Removes NA values
dfContaminationRatePopulation = dfContaminationRatePopulation %>%
  filter(!is.na(contamination_rate) & !is.na(estimated_population))


# Calculates the mean of contamination rate for every city and the max value of estimated population
dfContaminationRatePopulation2 = dfContaminationRatePopulation %>%
  group_by(city,state) %>%
  summarise(meanContaminationRate = mean(contamination_rate), estimated_population = max(estimated_population)) %>%
  filter(!is.infinite(meanContaminationRate))
## `summarise()` has grouped output by 'city'. You can override using the `.groups` argument.
head(dfContaminationRatePopulation2)
## # A tibble: 6 x 4
## # Groups:   city [6]
##   city                state meanContaminationRate estimated_population
##   <chr>               <chr>                 <dbl>                <dbl>
## 1 Abadia de Goiás     GO                     1.02                 8958
## 2 Abadia dos Dourados MG                     1.02                 7006
## 3 Abadiânia           GO                     1.02                20461
## 4 Abaeté              MG                     1.02                23250
## 5 Abaetetuba          PA                     1.03               159080
## 6 Abaíra              BA                     1.02                 8710
ggplot(dfContaminationRatePopulation2, aes(x=meanContaminationRate,y=estimated_population)) + 
  geom_point()

# There are cities with low population and high contamination rate
# But no city with high population has contamination rate >  1.05

# Check for Cities with higher population (Sao Paulo and Rio) or higher contamination rate
dfContaminationRatePopulation2 %>% 
  filter(estimated_population > 4.0e+06 | meanContaminationRate > 1.5)
## # A tibble: 4 x 4
## # Groups:   city [4]
##   city                   state meanContaminationRate estimated_population
##   <chr>                  <chr>                 <dbl>                <dbl>
## 1 Rio de Janeiro         RJ                     1.04              6747815
## 2 Santa Cruz do Sul      RS                     1.64               131365
## 3 Santa Margarida do Sul RS                     2.55                 2578
## 4 São Paulo              SP                     1.04             12325232
# Filter for "Santa Margarida do Sul" in the original dataframe
View(df %>% filter(city=='Santa Margarida do Sul') %>% arrange(date) %>% filter(confirmed>23000))

# Apparently it has an error on day 2021-04-04, with 23326 confirmed cases

Analyse distribution of the contamination rate means

# Filter by meanContaminationRete < 1.05 to facilitate visualization
dfContaminationRatePopulation3 = dfContaminationRatePopulation2 %>% 
  filter(meanContaminationRate < 1.05) %>% 
  arrange(meanContaminationRate)

mean(dfContaminationRatePopulation3$meanContaminationRate)
## [1] 1.020475
median(dfContaminationRatePopulation3$meanContaminationRate)
## [1] 1.019825
quantile(dfContaminationRatePopulation3$meanContaminationRate)
##       0%      25%      50%      75%     100% 
## 1.002164 1.016944 1.019825 1.023171 1.049820
ggplot(dfContaminationRatePopulation3, aes(x=meanContaminationRate)) + geom_density()